Vignette for MiniChip package

Michaela Schwaiger

2020-12-08

In this vignette for the MiniChip package we will generate a GRanges object of ChIP-seq peaks for Adnp, as well as a matched Granges object of randomized peaks across the mouse genome. Then we will plot the distribution of Adnp read counts (per Million reads) across the peak summits.

suppressPackageStartupMessages({
  library(MiniChip)
  library(ComplexHeatmap)
  library(GenomicFeatures)
  library(ggplot2)
  library(reshape2)
  library(BSgenome.Mmusculus.UCSC.mm10)
  #library(TxDb.Mmusculus.UCSC.mm10.knownGene)
  library(TxDb.Mmusculus.UCSC.mm10.ensGene)
  library(EnsDb.Mmusculus.v79)
})

Generate a peaks GRanges object.

First we load the narrowPeak output files generated by MACS peak finding for all 3 Adnp ChIP replicates, name the culumns in a meaningful way, and turn them into GRanges objects.

peaks1.d <- read.table(system.file("extdata", "Adnp_rep1_chr11_peaks.narrowPeak", package = "MiniChip"),header=FALSE)
peaks2.d <- read.table(system.file("extdata", "Adnp_rep2_chr11_peaks.narrowPeak", package = "MiniChip"),header=FALSE)
peaks3.d <- read.table(system.file("extdata", "Adnp_rep3_chr11_peaks.narrowPeak", package = "MiniChip"),header=FALSE)

names(peaks1.d) <- c("chr","start","end","name","score","empty","foldchange","pvalue","qvalue","summit")
names(peaks2.d) <- c("chr","start","end","name","score","empty","foldchange","pvalue","qvalue","summit")
names(peaks3.d) <- c("chr","start","end","name","score","empty","foldchange","pvalue","qvalue","summit")

peaks1 <- makeGRangesFromDataFrame(peaks1.d,
                                   keep.extra.columns=TRUE,
                                   ignore.strand=TRUE,
                                   seqinfo=NULL,
                                   seqnames.field=c("chr"),
                                   start.field=c("start"),
                                   end.field=c("end"),
                                   starts.in.df.are.0based=TRUE)

peaks2 <- makeGRangesFromDataFrame(peaks2.d,
                                   keep.extra.columns=TRUE,
                                   ignore.strand=TRUE,
                                   seqinfo=NULL,
                                   seqnames.field=c("chr"),
                                   start.field=c("start"),
                                   end.field=c("end"),
                                   starts.in.df.are.0based=TRUE)

peaks3 <- makeGRangesFromDataFrame(peaks3.d,
                                   keep.extra.columns=TRUE,
                                   ignore.strand=TRUE,
                                   seqinfo=NULL,
                                   seqnames.field=c("chr"),
                                   start.field=c("start"),
                                   end.field=c("end"),
                                   starts.in.df.are.0based=TRUE)

Then we select only the peaks which occur in at least 2 of the 3 replicates using the GRanges subsetByOverlaps and overlapsAny functions.

# peaks that occur in at least 2/3 replicates
peaks3.1 <- subsetByOverlaps(peaks3,peaks1)
peaks3.2 <- subsetByOverlaps(peaks3,peaks2)
peaks1.2 <- subsetByOverlaps(peaks1,peaks2)

#keep all 3.1 peaks, find 3.2 and 1.2 peaks that do not overlap 3.1 peaks

# peaks which are found in 3 and 2, but not in 3 and 1, and not in 1 and 2
peaks3.2u <- peaks3.2[overlapsAny(peaks3.2,peaks3.1) ==FALSE & overlapsAny(peaks3.2,peaks1.2) == FALSE]
# peaks which are found in 1 and 2, but not in 3 and 1, and not in 3 and 2
peaks1.2u <- peaks1.2[overlapsAny(peaks1.2,peaks3.1) ==FALSE & overlapsAny(peaks1.2,peaks3.2) == FALSE]
# add together all 3.1 peaks as well as unique 3.2 and 1.2 peaks
peaks <- c(peaks3.1,peaks3.2u,peaks1.2u)

We can now generate a peaknames object describing the peaks GRanges object we have just selected (for later use), and look at the peaks GRanges object to make sure it countains seqnames (chromosomes), start, end, strand, summit, and name filedds, as required by many functions in the MiniChip package.

peaknames <- "Adnp_peaks"
peaks
#> GRanges object with 1453 ranges and 7 metadata columns:
#>          seqnames              ranges strand |                 name     score
#>             <Rle>           <IRanges>  <Rle> |          <character> <integer>
#>      [1]    chr11     3193357-3193462      * | Adnp_r3_CHIP_peak_93        19
#>      [2]    chr11     3197534-3197624      * | Adnp_r3_CHIP_peak_94        19
#>      [3]    chr11     6413062-6413136      * | Adnp_r3_CHIP_peak_95        32
#>      [4]    chr11     6413385-6413485      * | Adnp_r3_CHIP_peak_96        33
#>      [5]    chr11   48821709-48821764      * | Adnp_r3_CHIP_peak_97        32
#>      ...      ...                 ...    ... .                  ...       ...
#>   [1449]    chr11 121414174-121414488      * |  Adnp_rep1_peak_6526       166
#>   [1450]    chr11 121457873-121458237      * |  Adnp_rep1_peak_6527       221
#>   [1451]    chr11 121675898-121676260      * |  Adnp_rep1_peak_6532       319
#>   [1452]    chr11 121884946-121885189      * |  Adnp_rep1_peak_6534        86
#>   [1453]    chr11 121977958-121978411      * |  Adnp_rep1_peak_6536       185
#>                empty foldchange    pvalue    qvalue    summit
#>          <character>  <numeric> <numeric> <numeric> <integer>
#>      [1]           .    1.95180   6.55661   1.90294        32
#>      [2]           .    2.28840   6.55834   1.90457        22
#>      [3]           .    5.99934   8.18814   3.20789        60
#>      [4]           .    5.10858   8.41215   3.35880        83
#>      [5]           .    6.06879   8.29235   3.29854        32
#>      ...         ...        ...       ...       ...       ...
#>   [1449]           .    7.10540   19.9389  16.69339       142
#>   [1450]           .    6.76789   25.5535  22.11681       169
#>   [1451]           .   10.33513   35.6853  31.95324       210
#>   [1452]           .    4.26805   11.5416   8.67331       178
#>   [1453]           .    7.53603   21.8940  18.57712       164
#>   -------
#>   seqinfo: 1 sequence from an unspecified genome; no seqlengths

Generate a randomly shuffled peaks GRanges object.

Then we use the SimulatePeaks function from the MiniChip package to generate a GRanges object called random.peaks that contains the same number of peaks (of the same lengths) as the original peaks GRanges object, but ech one shuffled to a randomly chosen location in the mouse genome. This is a useful control object to have when comparing the overlap of ChIP peaks with genomic annotations, for example genes or repeat elements.

#  randomize peak locations
nSites <- length(peaks)
peak.widths <- width(peaks)
random.peaks <-  SimulatePeaks(nSites,peak.widths,chromosomeSizes=system.file("extdata", "chrNameLength_mm10_chr11.txt", package = "MiniChip"))

Draw heatmaps of Adnp ChIP-seq read counts across the peak summits (+/- 3025bp).

First, we select the bam files of mapped reads that we want to use from the gbuehler deepSeqRepos….

#select bam files from Adnp experiment
bamFiles <- list.files(system.file("extdata", package = "MiniChip"), full.names=TRUE,pattern="*bam$")
#all.bamFiles <- list.files("/tungstenfs/scratch/gbuehler/deepSeqRepos/bam/", full.names=TRUE,pattern="*bam$")
#all.bamFiles <- grep("test",all.bamFiles,value=TRUE,invert=TRUE)
#bamFiles <- grep("1950F",all.bamFiles,value=TRUE)[1:8]
bamFiles
#> [1] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_ChIP_r1_chr11.bam" 
#> [2] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_ChIP_r2_chr11.bam" 
#> [3] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_ChIP_r3_chr11.bam" 
#> [4] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_Input_r1_chr11.bam"
#> [5] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_Input_r2_chr11.bam"
#> [6] "/tungstenfs/scratch/gbuehler/bioinfo/Rpackages/MiniChip/inst/extdata/Adnp_wt_Input_r3_chr11.bam"

…and extract short but meaningful names for the bam files decribing what has been chipped.

bamNames <- gsub(paste(system.file("extdata", package = "MiniChip"),"/",sep=""),"",bamFiles)
bamNames <- gsub("_chr11.bam","",bamNames)
bamNames
#> [1] "Adnp_wt_ChIP_r1"  "Adnp_wt_ChIP_r2"  "Adnp_wt_ChIP_r3"  "Adnp_wt_Input_r1"
#> [5] "Adnp_wt_Input_r2" "Adnp_wt_Input_r3"

You can select any bamFile that you are interested in, and the number of bamFiles you can choose is unlimited.

Check GC bias.

To check if the enrichments are biased towards GC rich regions, we can use the GCbias function to plot GC content versus the read counts for each alignment file.

GCbias(bamFiles[1:2],bamNames[1:2],pe="none", restrict="chr11",GCprob=TRUE, genome=BSgenome.Mmusculus.UCSC.mm10)

It looks like there is no GC bias in our selected bam files, so we can continue to calculate the heatmaps.

Calculate heatmaps (normalized read counts in windows around peaks).

Now we define the span (distance from peak summit to left and right, in this case 3025bp), and step size for windows across the peaks (in this case 50bp). Furthermore we crate a summits GRanges object that contains the genomic coordinates of the peak summits as s trat and end positions.

Then we use the SummitHeatmap function for the MiniChip package to calculate the counts per million (cpm) reads of each bam file in each window across the summit of each peak in the peaks object.

span <- 3025
step <- 50
summits <- peaks
start(summits) <- start(summits) + summits$summit
end(summits) <- start(summits)
names(summits) <- elementMetadata(summits)[,"name"]

counts <- SummitHeatmap(summits,bamFiles,bamNames,span,step,useCPM=TRUE,readShiftSize=75)
#> 
#>         ==========     _____ _    _ ____  _____  ______          _____  
#>         =====         / ____| |  | |  _ \|  __ \|  ____|   /\   |  __ \ 
#>           =====      | (___ | |  | | |_) | |__) | |__     /  \  | |  | |
#>             ====      \___ \| |  | |  _ <|  _  /|  __|   / /\ \ | |  | |
#>               ====    ____) | |__| | |_) | | \ \| |____ / ____ \| |__| |
#>         ==========   |_____/ \____/|____/|_|  \_\______/_/    \_\_____/
#>        Rsubread 2.4.2
#> 
#> //========================== featureCounts setting ===========================\\
#> ||                                                                            ||
#> ||             Input files : 6 BAM files                                      ||
#> ||                                                                            ||
#> ||                           Adnp_wt_ChIP_r1_chr11.bam                        ||
#> ||                           Adnp_wt_ChIP_r2_chr11.bam                        ||
#> ||                           Adnp_wt_ChIP_r3_chr11.bam                        ||
#> ||                           Adnp_wt_Input_r1_chr11.bam                       ||
#> ||                           Adnp_wt_Input_r2_chr11.bam                       ||
#> ||                           Adnp_wt_Input_r3_chr11.bam                       ||
#> ||                                                                            ||
#> ||              Paired-end : no                                               ||
#> ||        Count read pairs : no                                               ||
#> ||              Annotation : R data.frame                                     ||
#> ||      Dir for temp files : .                                                ||
#> ||                 Threads : 20                                               ||
#> ||                   Level : feature level                                    ||
#> ||      Multimapping reads : not counted                                      ||
#> || Multi-overlapping reads : counted                                          ||
#> ||   Min overlapping bases : 1                                                ||
#> ||              Read shift : 75 to downstream                                 ||
#> ||          Read reduction : to 5' end                                        ||
#> ||                                                                            ||
#> \\============================================================================//
#> 
#> //================================= Running ==================================\\
#> ||                                                                            ||
#> || Load annotation file .Rsubread_UserProvidedAnnotation_pid194571 ...        ||
#> ||    Features : 175813                                                       ||
#> ||    Meta-features : 1453                                                    ||
#> ||    Chromosomes/contigs : 1                                                 ||
#> ||                                                                            ||
#> || Process BAM file Adnp_wt_ChIP_r1_chr11.bam...                              ||
#> ||    Single-end reads are included.                                          ||
#> ||    Total alignments : 1668024                                              ||
#> ||    Successfully assigned alignments : 245513 (14.7%)                       ||
#> ||    Running time : 0.01 minutes                                             ||
#> ||                                                                            ||
#> || Process BAM file Adnp_wt_ChIP_r2_chr11.bam...                              ||
#> ||    Single-end reads are included.                                          ||
#> ||    Total alignments : 1466194                                              ||
#> ||    Successfully assigned alignments : 193558 (13.2%)                       ||
#> ||    Running time : 0.01 minutes                                             ||
#> ||                                                                            ||
#> || Process BAM file Adnp_wt_ChIP_r3_chr11.bam...                              ||
#> ||    Single-end reads are included.                                          ||
#> ||    Total alignments : 1899151                                              ||
#> ||    Successfully assigned alignments : 201429 (10.6%)                       ||
#> ||    Running time : 0.01 minutes                                             ||
#> ||                                                                            ||
#> || Process BAM file Adnp_wt_Input_r1_chr11.bam...                             ||
#> ||    Single-end reads are included.                                          ||
#> ||    Total alignments : 1750427                                              ||
#> ||    Successfully assigned alignments : 151957 (8.7%)                        ||
#> ||    Running time : 0.01 minutes                                             ||
#> ||                                                                            ||
#> || Process BAM file Adnp_wt_Input_r2_chr11.bam...                             ||
#> ||    Single-end reads are included.                                          ||
#> ||    Total alignments : 1570400                                              ||
#> ||    Successfully assigned alignments : 133978 (8.5%)                        ||
#> ||    Running time : 0.01 minutes                                             ||
#> ||                                                                            ||
#> || Process BAM file Adnp_wt_Input_r3_chr11.bam...                             ||
#> ||    Single-end reads are included.                                          ||
#> ||    Total alignments : 1576344                                              ||
#> ||    Successfully assigned alignments : 127959 (8.1%)                        ||
#> ||    Running time : 0.01 minutes                                             ||
#> ||                                                                            ||
#> || Write the final count table.                                               ||
#> || Write the read assignment summary.                                         ||
#> ||                                                                            ||
#> \\============================================================================//

This will return the counts object, a list of length 8 (since we are using 8 bam files), where each element of the list contains a matrix of cpm values across all windows for all peaks. If you want to also save a seperately clustered heatmap for each bam file to a file, set plotHM=TRUE.

To plot heatmaps for each sample (bam file) sorted by the first bam file, you can use the DrawSummitHeatmaps function. By providing shorter names for the samples you can ensure nice heatmap titles. Here we select only the ChIP samples (and not the Input samples).

ht_list <- DrawSummitHeatmaps(counts[c(1,2,3)], bamNames[c(1,2,3)],orderSample=1, use.log=TRUE,
                              bottomCpm=c(0,0,0), medianCpm = c(2,2,2), topCpm = c(4,4,4), orderWindows=2, TargetHeight=500)
draw(ht_list, padding = unit(c(3, 8, 8, 2), "mm"))

Draw cumulative plots of Adnp ChIP-seq and Input read counts across the peak summits (+/- 3025bp).

First we will calculate the mean counts per million (cpm) values for the three replicates of Adnp Chip and Input using the SummarizeHeatmaps function.

inx.ChIP <- grep("ChIP",names(counts))
Adnp <- grep("Adnp",names(counts[inx.ChIP]),value=TRUE)
Input <- grep("Input",names(counts),value=TRUE)[1:3]

sampleList <- list(Adnp=Adnp,Input=Input)
meanCounts <- SummarizeHeatmaps(counts,sampleList)
#> [1] "summarizing group Adnp"
#> [1] "summarizing group Input"

Then we can split the epaks into those that overlap a promoter and those that do not, and generate the cumululative plots using the CumulativePlots function.

#select peaks that overlap a TSS (+/- 1kb)
#txdb=loadDb("/tungstenfs/scratch/gbuehler/michi/Annotations/GENCODE/Mouse/release_M23/gencode.vM23.annotation.txdb.sqlite")
txdb=TxDb.Mmusculus.UCSC.mm10.ensGene
TSSs <- promoters(txdb,upstream=1000,downstream=1000)
#> Warning in valid.GenomicRanges.seqinfo(x, suggest.trim = TRUE): GRanges object contains 1 out-of-bound range located on sequence
#>   chr4_JH584295_random. Note that ranges located on a sequence whose
#>   length is unknown (NA) or on a circular sequence are not considered
#>   out-of-bound (use seqlengths() and isCircular() to get the lengths and
#>   circularity flags of the underlying sequences). You can use trim() to
#>   trim these ranges. See ?`trim,GenomicRanges-method` for more
#>   information.
summit_TSS_overlap <- overlapsAny(summits,TSSs)
summit_TSS_overlap_names <- names(summits[summit_TSS_overlap == TRUE])

#cumulative plots
CumulativePlots(meanCounts,bamNames = names(meanCounts),
                                   span=3025,step=50,summarizing = "mean",overlapNames = summit_TSS_overlap_names)

This suggests that some of the Adnp peaks overlap promoters, and that they have as much enrichment in Adnp as the ones which do not overlap promoters. To see which ones overlap promoters, we can add the promoter annotation to the heatmaps, using the AnnotationHeatmap function.

Draw heatmaps of Adnp ChIP-seq and Input read counts and promoter annotation around the peak summits (+/- 3025bp).

promoters <- promoters(txdb,upstream=150,downstream=150)
annoname <- "promoters"
promoters.overlap <- AnnotationHeatmap(summits,annotation=promoters,annoname,span,step)
annos <- list(promoters.overlap)
names(annos) <- c("promoters")
allCounts <- c(meanCounts,annos)

cols <- c("darkblue","darkred","darkgreen")
upper.cpm <- c(rep(2,2),rep(1,1))
splitHM <- ifelse(summit_TSS_overlap==TRUE,"TSS","no TSS")

heatmap_list <- DrawSummitHeatmaps(allCounts, names(allCounts), plotcol= cols, medianCpm = upper.cpm, orderSample = 1, use.log=TRUE, summarizing = "mean", orderWindows=2,MetaScale=c("all","all","individual"), TargetHeight=500,
                                   splitHM=splitHM)
draw(heatmap_list, padding = unit(c(3, 8, 8, 2), "mm"),show_heatmap_legend=FALSE)

Draw genome browser tracks of Adnp ChIP read counts across the Adnp peak with the highest score.

#get bigwig file names
#all.bwFiles <- list.files("/tungstenfs/scratch/gbuehler/deepSeqRepos/bigwig/", full.names=TRUE,pattern="*bw$")
#all.bwFiles <- grep("uni",all.bwFiles,value=TRUE)
#bwFiles <- grep("872F",all.bwFiles,value=TRUE)[1:2]
bwFiles <- list.files(system.file("extdata", package = "MiniChip"), full.names=TRUE,pattern="*bw$")
bedFiles <- list.files(system.file("extdata", package = "MiniChip"), full.names=TRUE,pattern="*bed$")

#plot
plotTracks(bwFilesPlus=bwFiles,bwNames=c(rep("Adnp",2)),txdb=txdb,EnsDb=EnsDb.Mmusculus.v79,bedFiles=bedFiles,
           plotregion=peaks[peaks$score==max(peaks$score)],plotExtension=5000,plotranges=rep(0.8,2))
#> [1] "yes"
#> [1] "yes"